Challenge 9: Baby Names

Author

Elizabeth Scholtes

Parts 1 & 2: The Data

library(tidyverse)
names <- read_csv(here::here("Labs", "Lab 9", "StateNames_A.csv"))

Part 3: Summarizing & Visualizing the Number of Allisons

Question 1

# Rename column as Sex
names <- rename(names, "Sex" = "Gender")

Challenge Part 2 Section 2

library(DT)
datatable(names)
allison_names <- names |>
  group_by(State, Sex) |>
  filter(Name == "Allison") |>
  summarize(Count = sum(Count)) |>
  pivot_wider(names_from = Sex,
              values_from = Count) |>
  mutate(
    M = coalesce(M, 0)
  )
allison_names
# A tibble: 51 × 3
# Groups:   State [51]
   State     F     M
   <chr> <dbl> <dbl>
 1 AK      232     0
 2 AL     1535     0
 3 AR     1198     0
 4 AZ     1880     0
 5 CA    12413     0
 6 CO     1594     0
 7 CT     1099     0
 8 DC      321     0
 9 DE      294     0
10 FL     4455     0
# … with 41 more rows

Question 2

allison_F_names <- names |>
  filter(Name == "Allison",
         Sex == "F")

Question 3

# Create data frame that can be used in future questions
names_by_year <- allison_F_names |>
  group_by(Year) |>
  summarize(Count = sum(Count))

names_by_year |>
  ggplot(mapping = aes(x = Year,
                       y = Count)) +
  geom_col() +
  labs(title = "Number of people assigned female at birth with the name Allison")

Part 4: Modeling the Number of Allisons

Question 4

allison_lm <- names_by_year |>
  lm(Count ~ Year, data = _)
allison_lm

Call:
lm(formula = Count ~ Year, data = names_by_year)

Coefficients:
(Intercept)         Year  
   209689.8       -101.5  

Question 5

names_by_year |>
  ggplot(aes(x = Year,
             y = Count)) +
  geom_point() +
  stat_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'

Question 6

Estimated Regression Equation:

Count(hat) = 209689.8 - 101.5*Year

Question 7

allison_lm |>
  broom::augment() |>
  ggplot(mapping = aes(x = .resid,
                       y = Count)) +
  geom_point() +
  labs(x = "Residuals")

It looks like there is a pattern with the residuals in that the residual value gets higher as the number of “Allisons” per year gets higher. This is not ideal because that means that the residuals are not random and do not necessarily satisfy linear model conditions

Question 8

From this model, we can conclude that the name Allison is becoming less and less popular. Maybe in terms of how often the name is used it’s less “cool”, but that’s not necessarily true!

Part 5: Spelling by State

Question 1

allan_names <- names |>
  filter(Name == "Allan" | Name == "Alan" | Name == "Allen",
         Sex == "M")

Question 2

statenames <- allan_names |>
  pivot_wider(names_from = Name, values_from = Count) |>
  filter(Year == 2000,
         State == "CA" | State == "PA") |>
  select(State, Allen, Alan, Allan) |>
  mutate(
    Allen = coalesce(Allen, 0),
    Alan = coalesce(Alan, 0),
    Allan = coalesce(Allan, 0)
  )
statenames
# A tibble: 2 × 4
  State Allen  Alan Allan
  <chr> <dbl> <dbl> <dbl>
1 CA      176   579   131
2 PA       56    51    12

Question 3

percents <- statenames |>
  rowwise()|>
  mutate(
    total_sum = sum(c_across(Allen : Allan)),
    Allen = Allen / total_sum,
    Alan = Alan / total_sum,
    Allan = Allan / total_sum
    ) |>
  select(State:Allan)
percents
# A tibble: 2 × 4
# Rowwise: 
  State Allen  Alan Allan
  <chr> <dbl> <dbl> <dbl>
1 CA    0.199 0.653 0.148
2 PA    0.471 0.429 0.101

Challenge: Creating Nice Tables

Part 1 and Step 1 of Part 2

library(kableExtra)
# Table of Allisons by State
knitr::kable(allison_names,
             format = "html",
             col.names = c("State",
                           "Sex assigned at Birth",
                           "Number of People with Name Allison"),
             booktabs = TRUE) |>
  kable_styling(latex_options = "scale_down")
State Sex assigned at Birth Number of People with Name Allison
AK 232 0
AL 1535 0
AR 1198 0
AZ 1880 0
CA 12413 0
CO 1594 0
CT 1099 0
DC 321 0
DE 294 0
FL 4455 0
GA 3257 0
HI 183 0
IA 1477 0
ID 451 0
IL 5110 0
IN 3067 0
KS 1283 0
KY 1905 20
LA 1209 0
MA 2218 0
MD 2229 0
ME 340 0
MI 4014 0
MN 2374 0
MO 2882 0
MS 817 0
MT 226 0
NC 3435 0
ND 285 0
NE 807 0
NH 412 0
NJ 3052 0
NM 399 0
NV 729 0
NY 5747 0
OH 5487 0
OK 1421 0
OR 1186 0
PA 4307 0
RI 306 0
SC 1228 0
SD 376 0
TN 2488 0
TX 10192 0
UT 1125 0
VA 3220 0
VT 135 0
WA 1956 0
WI 2367 0
WV 813 0
WY 142 0
# Allan names raw counts
knitr::kable(statenames,
              format = "html",
             caption = "Number of 'Alans' by State",
             booktabs = TRUE) |>
  add_header_above(c("Location" = 1, "Name" = 3))
Number of 'Alans' by State
Location
Name
State Allen Alan Allan
CA 176 579 131
PA 56 51 12
# Allan names by percent
knitr::kable(percents,
             format = "html",
             caption = "Percentage of 'Alans' by State",
             booktabs = TRUE) |>
  kable_styling(font_size = 18)
Percentage of 'Alans' by State
State Allen Alan Allan
CA 0.1986456 0.6534989 0.1478555
PA 0.4705882 0.4285714 0.1008403

Part 2 - Step 2 At Top of Lab